
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This file computes the AKM estimates   %
% code is based on CCK (2015)                   %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


path(path,'D:\Workdata\702525\MATLAB_AKM\matlab_bgl\'); %path to the matlabBGL files 

diary('I:\Workdata\702525\Alex\MA\AKM\1step_over5.log')

clear all

%LOAD DATA  
s=['I:\Workdata\702525\Alex\MA\manager\personyear_panel_1step_over5.raw'];
data=importdata(s);

%impose an initial sample restriction
year=data(:,2);
sel =(year>1990);
    
data=data(sel,:);

id=data(:,1);
year=data(:,2);
firmid=data(:,3);
managerid=data(:,4);
y=data(:,5);
age=data(:,6);
educ=data(:,7);
exp=data(:,8);
clear data

%recode education
educ(educ==1)=0;
educ(educ==2)=1;
educ(educ==3)=2;
educ(educ==4)=3;

firmid_old=firmid;
id_old=id;
managerid_old=managerid;

N=length(y); 

disp('Relabeling ids again...')
%relabel the firms
[firms,m,n]=unique(firmid);
firmid=n;

%relabel the workers
[ids,m,n]=unique(id);
id=n;

%relabel the managers
[managerids,m,n]=unique(managerid);
managerid=n;

%descriptive stats for connected set
fprintf('\n')
disp('Now restricted to the largest connected set');
s=['# of p-y obs: ' int2str(length(y))];
disp(s);
s=['# of workers: ' int2str(max(id))];
disp(s);
s=['# of firms: ' int2str(max(firmid))];
disp(s);
s=['# of managers: ' int2str(max(managerid))];
disp(s);

s=['mean wage: ' num2str(mean(y))];
disp(s)
s=['variance of wage: ' num2str(var(y))];
disp(s)
fprintf('\n')

%ESTIMATE AKM
disp('Building matrices...')
NT=length(y);
N=max(id);
J=max(firmid);
K=max(managerid);

D=sparse(1:NT,id',1);
F=sparse(1:NT,firmid',1);
G=sparse(1:NT,managerid',1);

S=speye(J-1);
S=[S;sparse(-zeros(1,J-1))];  %N+JxN+J-1 restriction matrix 

T=speye(K-1);
T=[T;sparse(-zeros(1,K-1))];  %N+KxN+K-1 restriction matrix 


E=sparse(1:NT,educ+1,1);
disp('Age tabulation')
tabulate(age)
age=(age-40)/40; %normalize to age 40 and rescale to avoid big numbers
A=[bsxfun(@times,E,age.^2),bsxfun(@times,E,age.^3)]; %age cubic by education

%Z=A;
yrmin=min(year);
R=sparse(1:NT,year-yrmin+1,1); %year effects
R(:,1)=[];

%vapp=(vapp-4000)/8000;
%V=[bsxfun(@times,E,vapp)];
%Z=[R, A, V];
Z=[R, A];

clear R E A


X=[D,F*S,G*T,Z];

disp('Running AKM...')
tic
xx=X'*X;
xy=X'*y;
L=ichol(xx,struct('type','ict','droptol',1e-2,'diagcomp',.1));
b=pcg(xx,xy,1e-10,3000,L,L');
toc
disp('Done')

%ANALYZE RESULTS
xb=X*b;
r=y-xb;

LV=inv(L);
dof=NT-N-J-K+2-size(Z,2);
var=sum(LV'.*LV,2);
var=(r'*r)/(dof-1)*var;
var=sqrt(var);

clear xx xy L LV

disp('Goodness of fit:')
dof=NT-J-N-K+2-size(Z,2)
RMSE=sqrt(sum(r.^2)/dof)
RSS=sum(r.^2);
TSS=sum((y-mean(y)).^2);
R2=1-sum(r.^2)/TSS
adjR2=1-sum(r.^2)/TSS*(NT-1)/dof

ahat=b(1:N);
ghat=b(N+1:N+J-1);
mhat=b(N+J:N+J+K-2);
mvhat=var(N+J:N+J+K-2);
mvhat=full(mvhat);
bhat=b(N+J+K-1:end);
disp('check for problems with year effects. should report zero')
sum(bhat==0)

pe=D*ahat;
fe=F*S*ghat;
me=G*T*mhat;
mevar=G*T*mvhat;
xb=X(:,N+J+K-1:end)*bhat;

clear X b

disp('Correlation of worker and firm effs');
corr(pe,fe)
disp('Correlation of worker and manager effs');
corr(pe,me)
disp('Correlation of manager and firm effs');
corr(me,fe)
disp('Means of person/firm/manager effs')
mean([pe,fe,me])

disp('Full Covariance Matrix of Components')
disp('    y      pe      fe      me      xb      r')
C=cov([y,pe,fe,me,xb,r])


%SAVE OUTPUT (this will write to the current directory)
disp('Saving main effects')
out=[id_old firmid_old managerid_old year pe fe me mevar xb];
s=['I:\Workdata\702525\Alex\MA\manager\manager_fe_1step_over5.txt'];
dlmwrite(s, out, 'delimiter', '\t', 'precision', 16);


%regress using worker and firm FE only

X=[D,F*S,Z];

disp('Running regression...')
tic
xx=X'*X;
xy=X'*y;
L=ichol(xx,struct('type','ict','droptol',1e-2,'diagcomp',.1));
b=pcg(xx,xy,1e-10,1000,L,L');
toc
disp('Done')
clear xx xy L

xb=X*b;
r2=y-xb;
disp('Goodness of fit:')
dof2=NT-N-J+1-size(Z,2)
RMSE2=sqrt(sum(r2.^2)/dof2)
RSS2=sum(r2.^2)
TSS2=sum((y-mean(y)).^2);
R22=1-sum(r2.^2)/TSS2
adjR22=1-sum(r2.^2)/TSS2*(NT-1)/dof2

%F test
F=(RSS2-RSS)/RSS/K*(dof-1)



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

disp('Match Effects Model')
dig=ceil(max(log10(firmid)));
firmiddec=firmid./(10^dig);
matchid=managerid+firmiddec;
[matchnew,m,n]=unique(matchid);
matchid=n;

M=sparse(1:NT,matchid',1);
X=[D,M,Z];
xx=X'*X;
xy=X'*y;
L=ichol(xx,struct('type','ict','droptol',1e-2,'diagcomp',.1));
b=pcg(xx,xy,1e-10,1000,L,L');
r_match=y-X*b;
dof_match=NT-size(X,2)
RMSE_match=sqrt(sum(r_match.^2)/dof_match)
R2_match=1-sum(r_match.^2)/TSS
adjR2_match=1-sum(r_match.^2)/TSS*(NT-1)/dof_match
clear Z b

%further decomposition
fprintf('\n')
disp('Further Decompositions:')
disp('Decomposing residual into match and transitory component')

xx=M'*M;
xy=M'*r;
m=M*(xx\xy);
e=r-m;
disp('Full Covariance Matrix of Components')
disp('    y      pe      fe      me      xb      m      e')
C=cov([y,pe,fe,me,xb,m,e])

fedec = ceil(10 * tiedrank(fe) / length(fe));
pedec = ceil(10 * tiedrank(me) / length(me));
for j=1:10
    for k=1:10
        p(j,k)=mean((pedec==j)&(fedec==k));
        meanr(j,k)=mean(r.*(pedec==j).*(fedec==k))/p(j,k);
    end
end
for j=1:10
    for k=1:10
        meanm(j,k)=mean(m.*(pedec==j).*(fedec==k))/p(j,k);
    end
end
disp('Mean match effect by pe/fe decile')
meanm

figure
bar3(meanm)
xlabel('Firm FE Decile')
ylabel('Manager FE Decile')
zlabel('Mean residual')



diary off;
